from pipeline example described by Pierre-Luc Germain, course sta426 UZH
## data can be downloaded from : http://imlspenticton.uzh.ch/dump/files_for_levesque.tar
# set local path
local.path <- getwd()
setwd(local.path)
#define data we want to look at
sample_names <- list("patient1_HS", "patient1_SCC", "patient2_HS", "patient2_AK")
patient1_HS.path <- file.path("data", "patient1_HS")
patient1_SCC.path <- file.path("data", "patient1_SCC")
patient2_HS.path <- file.path("data", "patient2_HS")
patient2_AK.path <- file.path("data", "patient2_AK")
# read in filtered data and create list of SingleCellExperiment objects
paths <- list(patient1_HS.path, patient1_SCC.path, patient2_HS.path, patient2_AK.path)
sces <- lapply(paths, function(i) read10xCounts(file.path(i, "outs/filtered_feature_bc_matrix"), col.names = TRUE))
split_data <- function(sceo) {
# Split the data, store ADT in alternative experiment
sceo <- splitAltExps(sceo, rowData(sceo)$Type)
# Coerce sparse matrix for ADT into a dense matrix
counts(altExp(sceo)) <- as.matrix(counts(altExp(sceo)))
sceo
}
sces <- lapply(sces, split_data)
# only select the gene expression data, drop the antibody capture
sce.patient1_HS <- sces[[1]]
sce.patient1_SCC <- sces[[2]]
sce.patient2_HS <- sces[[3]]
sce.patient2_AK <- sces[[4]]
# add name to sce for output graph
attr(sce.patient1_HS, "name") <- "Patient1 HS"
attr(sce.patient1_SCC, "name") <- "Patient1 SCC"
attr(sce.patient2_HS, "name") <- "Patient2 HS"
attr(sce.patient2_AK, "name") <- "Patient2 AK"
ADD A NICE TABLE WITH ALL THE MARKER USED (and their source) AND EXPLAINED WHICH WEREN’T FOUND also explain that we decided to add the known markers from the litterature into the group of genes used to produce a low-dimensional representation of the data even when they were not part of the most variable genes
# run PCA
# using known genes
# genes not found in the data : "DCDN" "WISP2" "SEPP1" "TYP1"
# genes linked to pericytevSMC : "ACTA2","TAGLN"
# were removed from the following list
known_markers <- c("DSC3","DSP","LGALS7","KRT5","KRT14","COL17A1","KRT1","KRT10","LOR", "SPINK5","KRT6B","GJB2","GJB6","ATP1B1","MGST1","FASN","DCD","AQP5","LUM","MMP2","SLPI","CTHRC1","MFAP5","TSPAN8","CCL19","APOE","CXCL2","CXCL3","EFEMP1","APCDD1","ID1","WIF1","COL18A1","PTGDS","ASPN","POSTN","GPC3","TNN","SFRP1","RGS5","MYL9","TPM2","RERGL","PECAM1","SELE","CLDN5","VWF","PROX1","LYVE1","HLA-DRA","FCER1G","TYROBP","AIF1","CD1C","CD207","CD68","RNASE1","ITGAX","CD3D","CD3E","CD52","IL7R","DCT","MLANA","PMEL","CDH19","NGFR","UBE2C","PCNA")
sc_PCA <- function(sceo, hvgo, known_markers) {
# only gives index of the first encountered
known_genes <- rownames(sceo)[which(rowData(sceo)$Symbol %in% known_markers)]
# Check that the genes we know are also part of the highly variable genes
idx_notfound <- which(!(known_genes %in% hvgo))
# print the markers that were not found in the data
#known_markers[idx_notfound]
# if some of the known markers were not selected, add them
if (length(idx_notfound) > 0) {
cat("Adding", known_markers[idx_notfound], "to the gene set used for PCA in", attr(sceo, "name"), "\n")
hvgo <- c(hvgo, known_genes[idx_notfound])
}
# using highly variable genes
sceo <- runPCA(sceo, subset_row=hvgo)
# check the variance explained by the PCs:
pc.var <- attr(reducedDim(sceo),"percentVar")
plot(pc.var, xlab="PCs", ylab="% variance explained", main=paste("Variance explained across PCs for", attr(sceo, "name")))
# restrict to the first 20 components:
reducedDim(sceo) <- reducedDim(sceo)[,1:20]
# run TSNE and UMAP based on the PCA:
sceo <- runTSNE(sceo, dimred="PCA")
sceo <- runUMAP(sceo, dimred="PCA")
}
sce.patient1_HS <- sc_PCA(sce.patient1_HS, hvg.patient1_HS, known_markers)
## Adding WIF1 IL7R to the gene set used for PCA in Patient1 HS
sce.patient1_SCC <- sc_PCA(sce.patient1_SCC, hvg.patient1_SCC, known_markers)
## Adding KRT10 CXCL2 TNN SFRP1 AIF1 CD3E NGFR to the gene set used for PCA in Patient1 SCC
sce.patient2_HS <- sc_PCA(sce.patient2_HS, hvg.patient2_HS, known_markers)
## Adding WIF1 AIF1 CD207 ITGAX CD3E to the gene set used for PCA in Patient2 HS
sce.patient2_AK <- sc_PCA(sce.patient2_AK, hvg.patient2_AK, known_markers)
## Adding SFRP1 FCER1G ITGAX CD3E to the gene set used for PCA in Patient2 AK
sc_cluster <- function(sceo, k=30) {
go <- buildKNNGraph(sceo, BNPARAM=AnnoyParam(), use.dimred="PCA", k=k)
sceo$cluster <- as.factor(cluster_louvain(go)$membership)
#plots <- plot_grid( plotTSNE(sceo, colour_by="cluster", text_by="cluster") + ggtitle(attr(sceo, "name")), plotUMAP(sceo, colour_by="cluster", text_by="cluster") + ggtitle(attr(sceo, "name")) )
plots <- plot_grid(
plotTSNE(sceo, colour_by="cluster", text_by="cluster"),
plotUMAP(sceo, colour_by="cluster", text_by="cluster"))
title <- ggdraw() + draw_label(attr(sceo, "name"), fontface='bold')
plots <- plot_grid(title, plots, ncol=1, rel_heights=c(0.1, 1))
print(plots)
return(list(sceo, go))
}
results <- sc_cluster(sce.patient1_HS)
sce.patient1_HS <- results[[1]]
g.patient1_HS <- results[[2]]
results <- sc_cluster(sce.patient1_SCC)
sce.patient1_SCC <- results[[1]]
g.patient1_SCC <- results[[2]]
results <- sc_cluster(sce.patient2_HS)
sce.patient2_HS <- results[[1]]
g.patient2_HS <- results[[2]]
results <- sc_cluster(sce.patient2_AK)
sce.patient2_AK <- results[[1]]
g.patient2_AK <- results[[2]]
# genes not found in our data : "DCDN" "WISP2" "SEPP1" "TYP1"
# were removed from the list below
genes <- list(
# classification: https://www.sciencedirect.com/science/article/pii/S0923181120301985?via%3Dihub#bib0050
# KERATINOCYTE
keratinocyte = c("DSC3","DSP","LGALS7"),
#keratinocyte_basal = c("KRT5","KRT14","COL17A1"),
#keratinocyte_suprabasal = c("KRT1","KRT10"),
#keratinocyte_differentiated = c("LOR", "SPINK5"),
#keratinocyte_ORS = c("KRT6B"),
#keratinocyte_channel = c("GJB2","GJB6","ATP1B1"),
#keratinocyte_sebaceous_gland = c("MGST1","FASN"),
#keratinocyte_sweat_gland = c("DCD","AQP5"),
# FIBROBLAST
fibroblast = c("LUM","MMP2"),
# fibroblast subclassification from: https://www.nature.com/articles/s42003-020-0922-4#
#fibroblast_secretory_reticular = c(SLPI","CTHRC1","MFAP5","TSPAN8"),
#fibroblast_proinflammatory = c("CCL19","APOE","CXCL2","CXCL3","EFEMP1"),
#fibroblast_secretory_papillary = c("APCDD1","ID1","WIF1","COL18A1","PTGDS"),
#fibroblast_mesenchymal = c("ASPN","POSTN","GPC3","TNN","SFRP1"),
# PERICYTE & vSMC
#pericytevSMC = c("ACTA2","TAGLN"), #vSMC : vascular smooth muscel cell
#pericytevSMC_pericyte = c("RGS5"),
#pericytevSMC_vSMC = c("MYL9","TPM2","RERGL"),
# ENDOTHELIAL CELL
endothelial = c("PECAM1","SELE","CLDN5","VWF"),
#endothelial_lymphatic = c("PROX1","LYVE1"),
# MYELOID CELL
myeloid = c("HLA-DRA","FCER1G","TYROBP","AIF1"),
#myeloid_dendritic = c("CD1C"),
#myeloid_langerhans = c("CD207"),
#myeloid_macrophage = c("CD68","RNASE1","ITGAX"),
# LYMPHOCYTE
lymphocyte = c("CD3D","CD3E","CD52","IL7R"),
# MELANOCYTE
melanocyte = c("DCT","MLANA","PMEL"),
# SCHWANN CELL
schwann = c("CDH19","NGFR"),
# MITOTIC CELL
mitotic = c("UBE2C","PCNA")
)
convert_rownames <- function(sceo, go, genes) {
return(paste(rownames(sceo), rowData(sceo)$Symbol, sep = "."))
}
annotate_cells <- function(sceo, go, genes) {
kmo <- lapply(genes, FUN=function(go) grep(paste0(go, "$", collapse="|"), rownames(sceo), value=TRUE))
# TODO: try to modify this so that we don't have to convert the rownames anymore and use
# the marker names save in rowData(sceo)$Symbol directly
#kmo <- lapply(genes, FUN=function(go) grep(paste0(go, "$", collapse="|"), rowData(sceo)$Symbol, value=TRUE))
#print(kmo)
return(list(sceo, kmo))
}
rownames(sce.patient1_HS) <- convert_rownames(sce.patient1_HS, g.patient1_HS, genes)
results <- annotate_cells(sce.patient1_HS, g.patient1_HS, genes)
sce.patient1_HS <- results[[1]]
km.patient1_HS <- results[[2]]
rownames(sce.patient1_SCC) <- convert_rownames(sce.patient1_SCC, g.patient1_SCC, genes)
results <- annotate_cells(sce.patient1_SCC, g.patient1_SCC, genes)
sce.patient1_SCC <- results[[1]]
km.patient1_SCC <- results[[2]]
rownames(sce.patient2_HS) <- convert_rownames(sce.patient2_HS, g.patient2_HS, genes)
results <- annotate_cells(sce.patient2_HS, g.patient2_HS, genes)
sce.patient2_HS <- results[[1]]
km.patient2_HS <- results[[2]]
rownames(sce.patient2_AK) <- convert_rownames(sce.patient2_AK, g.patient2_AK, genes)
results <- annotate_cells(sce.patient2_AK, g.patient2_AK, genes)
sce.patient2_AK <- results[[1]]
km.patient2_AK <- results[[2]]
# mean logcounts by cluster
pseudobulk <- function(sceo, kmo) {
pbo <- aggregateData(sceo, "logcounts", by=c("cluster"), fun="mean")
# TODO : find a way to delete the left annotation which is unreadable
# build a heatmap of the mean logcounts of the known markers:
h <- pheatmap(assay(pbo)[unlist(kmo),], annotation_row=data.frame(row.names=unlist(kmo), type=rep(names(kmo), lengths(kmo))), split=rep(names(kmo), lengths(kmo)), annotation_names_row=F, cluster_rows=FALSE, scale="row", main=paste(attr(sceo, "name"),"before markers aggregation"), fontsize_row=6, fontsize_col=10, angle_col = "45")
print(h)
#--- aggregation markers
# we will assign clusters to the cell type whose markers are the most expressed
# we extract the pseudo-bulk counts of the markers for each cluster
mato <- assay(pbo)[unlist(kmo),]
# we aggregate across markers of the same type
mato <- aggregate(t(scale(t(mato))), by=list(type=rep(names(kmo), lengths(kmo))), FUN=sum)
# for each column (cluster), we select the row (cell type) which has the maximum aggregated value
cl2o<- mato[,1][apply(mato[,-1], 2, FUN=which.max)]
# we convert the cells' cluster labels to cell type labels:
sceo$cluster2 <- cl2o[sceo$cluster]
# we aggregate again to pseudo-bulk using the new clusters
pbo <- aggregateData(sceo, "logcounts", by=c("cluster2"), fun="mean")
# we plot again the expression of the markers as a sanity check
# If we want to hide the gene markers show_rownames = FALSE
h1 <- pheatmap(assay(pbo)[unlist(kmo),], annotation_row=data.frame(row.names=unlist(kmo), type=rep(names(kmo), lengths(kmo))), split=rep(names(kmo), lengths(kmo)), annotation_names_row=F, cluster_rows=FALSE, scale="row", main=paste(attr(sceo, "name"),"after markers aggregation"), fontsize_row=6, fontsize_col=10, angle_col = "45")
print(h1)
# UMAP plot
p <- plotUMAP(sceo, colour_by="cluster2", text_by="cluster2", point_size=1) + ggtitle(attr(sceo, "name")) + theme(legend.title=element_blank())
print(p)
return(list(sceo, pbo, mato, cl2o))
}
#Patient 1 HS
#km.patient1_HS
#km.patient1_SCC
#rowData(sce.patient1_HS[c("keratinocyte", "fibroblast", "endothelial", "myeloid", )])$Symbol
results <- pseudobulk(sce.patient1_HS, km.patient1_HS)
sce.patient1_HS <- results[[1]]
pb.patient1_HS <- results[[2]]
mat.patient1_HS <- results[[3]]
cl2.patient1_HS <- results[[4]]
#Patient 1 SCC
results <- pseudobulk(sce.patient1_SCC, km.patient1_SCC)
sce.patient1_SCC <- results[[1]]
pb.patient1_SCC <- results[[2]]
mat.patient1_SCC <- results[[3]]
cl2.patient1_SCC <- results[[4]]
#Patient 2 HS
results <- pseudobulk(sce.patient2_HS, km.patient2_HS)
sce.patient2_HS <- results[[1]]
pb.patient2_HS <- results[[2]]
mat.patient2_HS <- results[[3]]
cl2.patient2_HS <- results[[4]]
#Patient 2 AK
results <- pseudobulk(sce.patient2_AK, km.patient2_AK)
sce.patient2_AK <- results[[1]]
pb.patient2_AK <- results[[2]]
mat.patient2_AK <- results[[3]]
cl2.patient2_AK <- results[[4]]
cluster_subtype <- function(sceo, cell_type, known_markers, subtype_markers) {
# select the cell labeled as keratinocytes in the previous step
sceo.sub <- sceo[,sceo$cluster2==cell_type]
sceo.sub <- remove_rare_genes(sceo.sub, 4)
results <- variance_stabilization(sceo.sub, 2000)
sceo.sub <- results[[1]]
hvgo.sub <- results[[2]]
#---- run the pipeline again on that subdataset
sceo.sub <- sc_PCA(sceo.sub, hvgo.sub, known_markers)
#--- clustering
results <- sc_cluster(sceo.sub)
sceo.sub <- results[[1]]
go.sub <- results[[2]]
results <- annotate_cells(sceo.sub, go.sub, subtype_markers)
sceo.sub <- results[[1]]
kmo.sub <- results[[2]]
#---
results <- pseudobulk(sceo.sub, kmo.sub)
sceo.sub <- results[[1]]
pbo.sub <- results[[2]]
mato.sub <- results[[3]]
cl2o.sub <- results[[4]]
return(sceo.sub)
}
known_markers.kera <- c("KRT5","KRT14","COL17A1","KRT1","KRT10","LOR", "SPINK5","KRT6B","GJB2","GJB6","ATP1B1","MGST1","FASN","DCD","AQP5")
#annotate the subclusters
genes.kera <- list(
# classification: https://www.sciencedirect.com/science/article/pii/S0923181120301985?via%3Dihub#bib0050
# KERATINOCYTE
#keratinocyte = c("DSC3","DSP","LGALS7"),
basal = c("KRT5","KRT14","COL17A1"),
suprabasal = c("KRT1","KRT10"),
differentiated = c("LOR", "SPINK5"),
ORS = c("KRT6B"),
channel = c("GJB2","GJB6","ATP1B1"),
sebaceous_gland = c("MGST1","FASN"),
sweat_gland = c("DCD","AQP5")
)
# TODO : find a way to avoid giving both known_merkers.kera and genes.kera since both contain the
# same information in essence. Maybe will be fixed by the first todo
sce.patient1_HS.kera <- cluster_subtype(sce.patient1_HS, "keratinocyte", known_markers.kera, genes.kera)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 11%
|
|========== | 14%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 31%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Adding KRT5 KRT10 AQP5 to the gene set used for PCA in Patient1 HS
sce.patient1_SCC.kera <- cluster_subtype(sce.patient1_SCC, "keratinocyte", known_markers.kera, genes.kera)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 12%
|
|=========== | 15%
|
|============= | 18%
|
|=============== | 21%
|
|================= | 24%
|
|=================== | 27%
|
|===================== | 30%
|
|======================= | 33%
|
|========================= | 36%
|
|============================ | 39%
|
|============================== | 42%
|
|================================ | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|====================================== | 55%
|
|======================================== | 58%
|
|========================================== | 61%
|
|============================================= | 64%
|
|=============================================== | 67%
|
|================================================= | 70%
|
|=================================================== | 73%
|
|===================================================== | 76%
|
|======================================================= | 79%
|
|========================================================= | 82%
|
|=========================================================== | 85%
|
|============================================================== | 88%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Adding FASN to the gene set used for PCA in Patient1 SCC
sce.patient2_HS.kera <- cluster_subtype(sce.patient2_HS, "keratinocyte", known_markers.kera, genes.kera)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 12%
|
|========== | 15%
|
|============ | 18%
|
|============== | 21%
|
|================ | 24%
|
|=================== | 26%
|
|===================== | 29%
|
|======================= | 32%
|
|========================= | 35%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================= | 56%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================= | 65%
|
|=============================================== | 68%
|
|================================================= | 71%
|
|=================================================== | 74%
|
|====================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 82%
|
|============================================================ | 85%
|
|============================================================== | 88%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Adding FASN to the gene set used for PCA in Patient2 HS
sce.patient2_AK.kera <- cluster_subtype(sce.patient2_AK, "keratinocyte", known_markers.kera, genes.kera)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 12%
|
|========== | 15%
|
|============ | 18%
|
|============== | 21%
|
|================ | 24%
|
|=================== | 26%
|
|===================== | 29%
|
|======================= | 32%
|
|========================= | 35%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================= | 56%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================= | 65%
|
|=============================================== | 68%
|
|================================================= | 71%
|
|=================================================== | 74%
|
|====================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 82%
|
|============================================================ | 85%
|
|============================================================== | 88%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
# need the previous block to be runned (at least until l. 298)
genes.dyn <- list(
# classification: https://www.sciencedirect.com/science/article/pii/S0092867420306723
keratinocyte_basal = "COL17A1",
keratinocyte_cycling = "MKI67",
keratinocyte_differentiating = "KRT1"
)
dynamic_plot <- function(sceo, go, genes) {
kmo <- lapply(genes, FUN=function(go) grep(paste0(go, "$", collapse="|"), rownames(sceo), value=TRUE))
# mean logcounts by cluster:
pbo <- aggregateData(sceo, "logcounts", by=c("cluster"), fun="mean")
lengths(kmo)
h <- pheatmap(assay(pbo)[unlist(kmo),], annotation_row=data.frame(row.names=unlist(kmo), type=rep(names(kmo), lengths(kmo))), split=rep(names(kmo), lengths(kmo)), cluster_rows=FALSE, scale="row", main="Before markers aggregation", fontsize_row=6, fontsize_col=10)
h
# we will assign clusters to the cell type whose markers are the most expressed
# we extract the pseudo-bulk counts of the markers for each cluster
mato <- assay(pbo)[unlist(kmo),]
# we aggregate across markers of the same type
mato <- aggregate(t(scale(t(mato))), by=list(type=rep(names(kmo), lengths(kmo))), FUN=sum)
# for each column (cluster), we select the row (cell type) which has the maximum aggregated value
cl3o <- mato[,1][apply(mato[,-1], 2, FUN=which.max)]
# we convert the cells' cluster labels to cell type labels:
sceo$cluster3 <- cl3o[sceo$cluster]
print(sceo)
# we aggregate again to pseudo-bulk using the new clusters
pbo <- aggregateData(sceo, "logcounts", by=c("cluster3"), fun="mean")
# we plot again the expression of the markers as a sanity check
h1 <- pheatmap(assay(pbo)[unlist(kmo),], annotation_row=data.frame(row.names=unlist(kmo), type=rep(names(kmo), lengths(kmo))), split=rep(names(kmo), lengths(kmo)), cluster_rows=FALSE, scale="row", main="After markers aggregation", fontsize_row=6, fontsize_col=10) #, scale="row", main="After markers aggregation", fontsize_row=6, fontsize_col=10)
h1
plotUMAP(sceo, colour_by="cluster3", text_by="cluster3", point_size=1)
#---- Histograms
total <- ncol(sceo)
basal <- ncol(sceo[,sceo$cluster3=="keratinocyte_basal"])
cycling <- ncol(sceo[,sceo$cluster3=="keratinocyte_cycling"])
diff <- ncol(sceo[,sceo$cluster3=="keratinocyte_differentiating"])
counts <- c(basal, cycling, diff)
percentage <- counts/total
kera.dyn <- data.frame(names=c("Basal", "Cycling", "Differentiating"), percentage=percentage)
kera.dyn
p <- ggplot(data=kera.dyn, aes(x=names, y=percentage, fill=names)) + geom_bar(stat="identity") + labs(title=paste("Proportion of Keratinocyte states in", attr(sceo, "name")),x="Subtypes", y="Percentage") + ylim(0,1) + theme(legend.position='none')
print(p)
}
dynamic_plot(sce.patient1_HS.kera, g.patient1_HS.kera, genes.dyn)
## class: SingleCellExperiment
## dim: 17383 1354
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(17383): ENSG00000238009.AL627309.1 ENSG00000241860.AL627309.5
## ... ENSG00000278817.AC007325.4 ENSG00000277196.AC007325.2
## rowData names(3): ID Symbol Type
## colnames(1354): AAACCTGCATAGAAAC-1 AAACCTGGTCTCATCC-1 ...
## TTTGTCACATGGTCTA-1 TTTGTCAGTCCTAGCG-1
## colData names(5): Sample Barcode cluster cluster2 cluster3
## reducedDimNames(3): PCA TSNE UMAP
## altExpNames(1): Antibody Capture
dynamic_plot(sce.patient1_SCC.kera, g.patient1_SCC.kera, genes.dyn)
## class: SingleCellExperiment
## dim: 16157 706
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(16157): ENSG00000241860.AL627309.5 ENSG00000237491.LINC01409
## ... ENSG00000277666.AC136352.3 ENSG00000273554.AC136616.1
## rowData names(3): ID Symbol Type
## colnames(706): AAACCTGGTAGATTAG-1 AAACGGGGTCCGTGAC-1 ...
## TTTGCGCGTCGGCATC-1 TTTGGTTAGTCATGCT-1
## colData names(5): Sample Barcode cluster cluster2 cluster3
## reducedDimNames(3): PCA TSNE UMAP
## altExpNames(1): Antibody Capture
dynamic_plot(sce.patient2_HS.kera, g.patient2_HS.kera, genes.dyn)
## class: SingleCellExperiment
## dim: 16623 1907
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(16623): ENSG00000237491.LINC01409 ENSG00000228794.LINC01128
## ... ENSG00000278817.AC007325.4 ENSG00000277196.AC007325.2
## rowData names(3): ID Symbol Type
## colnames(1907): AAACCTGGTCAAACTC-1 AAACGGGAGCCAGTAG-1 ...
## TTTGTCATCTATCCCG-1 TTTGTCATCTCACATT-1
## colData names(5): Sample Barcode cluster cluster2 cluster3
## reducedDimNames(3): PCA TSNE UMAP
## altExpNames(1): Antibody Capture
dynamic_plot(sce.patient2_AK.kera, g.patient2_AK.kera, genes.dyn)
## class: SingleCellExperiment
## dim: 16877 1106
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(16877): ENSG00000241860.AL627309.5 ENSG00000237491.LINC01409
## ... ENSG00000278817.AC007325.4 ENSG00000277196.AC007325.2
## rowData names(3): ID Symbol Type
## colnames(1106): AAACCTGAGAGTTGGC-1 AAACCTGTCCTTGACC-1 ...
## TTTGGTTGTGGTAACG-1 TTTGGTTTCTACCTGC-1
## colData names(5): Sample Barcode cluster cluster2 cluster3
## reducedDimNames(3): PCA TSNE UMAP
## altExpNames(1): Antibody Capture
known_markers.fibro <- c("SLPI","CTHRC1","MFAP5","TSPAN8","CCL19","APOE","CXCL2","CXCL3","EFEMP1","APCDD1","ID1","WIF1","COL18A1","PTGDS","ASPN","POSTN","GPC3","TNN","SFRP1")
genes.fibro <- list(
#fibroblast = c("LUM","MMP2"),
# # fibroblast subclassification from: https://www.nature.com/articles/s42003-020-0922-4#
secretory_reticular = c("SLPI","CTHRC1","MFAP5","TSPAN8"),
proinflammatory = c("CCL19","APOE","CXCL2","CXCL3","EFEMP1"),
secretory_papillary = c("APCDD1","ID1","WIF1","COL18A1","PTGDS"),
mesenchymal = c("ASPN","POSTN","GPC3","TNN","SFRP1")
)
sce.patient1_HS.fibro <- cluster_subtype(sce.patient1_HS, "fibroblast", known_markers.fibro, genes.fibro)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|=== | 4%
|
|===== | 8%
|
|======== | 12%
|
|=========== | 15%
|
|============= | 19%
|
|================ | 23%
|
|=================== | 27%
|
|====================== | 31%
|
|======================== | 35%
|
|=========================== | 38%
|
|============================== | 42%
|
|================================ | 46%
|
|=================================== | 50%
|
|====================================== | 54%
|
|======================================== | 58%
|
|=========================================== | 62%
|
|============================================== | 65%
|
|================================================ | 69%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|========================================================= | 81%
|
|=========================================================== | 85%
|
|============================================================== | 88%
|
|================================================================= | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
## Adding ID1 WIF1 SFRP1 to the gene set used for PCA in Patient1 HS
sce.patient1_SCC.fibro <- cluster_subtype(sce.patient1_SCC, "fibroblast", known_markers.fibro, genes.fibro)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========= | 13%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|=================== | 27%
|
|===================== | 30%
|
|======================= | 33%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|=============================================== | 67%
|
|================================================= | 70%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================= | 87%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Adding ID1 to the gene set used for PCA in Patient1 SCC
sce.patient2_HS.fibro <- cluster_subtype(sce.patient2_HS, "fibroblast", known_markers.fibro, genes.fibro)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========= | 13%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|=================== | 27%
|
|===================== | 30%
|
|======================= | 33%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|=============================================== | 67%
|
|================================================= | 70%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================= | 87%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
sce.patient2_AK.fibro <- cluster_subtype(sce.patient2_AK, "fibroblast", known_markers.fibro, genes.fibro)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|=== | 4%
|
|===== | 8%
|
|======== | 12%
|
|=========== | 15%
|
|============= | 19%
|
|================ | 23%
|
|=================== | 27%
|
|====================== | 31%
|
|======================== | 35%
|
|=========================== | 38%
|
|============================== | 42%
|
|================================ | 46%
|
|=================================== | 50%
|
|====================================== | 54%
|
|======================================== | 58%
|
|=========================================== | 62%
|
|============================================== | 65%
|
|================================================ | 69%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|========================================================= | 81%
|
|=========================================================== | 85%
|
|============================================================== | 88%
|
|================================================================= | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
## Adding CCL19 to the gene set used for PCA in Patient2 AK
# The following code was used to check which genes were missing from in the data
# the known_markers are from the litterature
data_markers <- rowData(sce.patient1_HS)$Symbol
known_markers <- c("DSC3","DSP","LGALS7","KRT5","KRT14","COL17A1","KRT1","KRT10","LOR", "SPINK5","KRT6B","GJB2","GJB6","ATP1B1","MGST1","FASN","DCD","AQP5","DCDN","LUM","MMP2","WISP2","SLPI","CTHRC1","MFAP5","TSPAN8","CCL19","APOE","CXCL2","CXCL3","EFEMP1","APCDD1","ID1","WIF1","COL18A1","PTGDS","ASPN","POSTN","GPC3","TNN","SFRP1","ACTA2","TAGLN","RGS5","MYL9","TPM2","RERGL","PECAM1","SELE","CLDN5","VWF","PROX1","LYVE1","HLA-DRA","FCER1G","TYROBP","AIF1","CD1C","CD207","CD68","RNASE1","SEPP1","ITGAX","CD3D","CD3E","CD52","IL7R","DCT","MLANA","PMEL","TYP1","CDH19","NGFR","UBE2C","PCNA")
idx_notfound <- which(!(known_markers %in% rowData(sce.patient1_HS)$Symbol))
print(paste("Done:", length(known_markers)-length(idx_notfound), "/", length(known_markers), "markers found"))
cat("Genes missing in dataset: ", known_markers[idx_notfound])